library(tidyverse)
library(plotly)
library(ggplot2)
library(knitr)
library(readr)
library(dplyr)
library(tidyr)
library(scales)
library(vcd)
library(tidyverse)
library(htmltools)

In this chapter, we analyze the relationship between locations and grades. Our analysis focuses exclusively on data where the grades are A, B, or C, and excludes records with grades labeled as N (Not Yet Graded), Z (Grade Pending), or P (Grade Pending issued upon reopening after a closure). To ensure accuracy and relevance, we begin by filtering out unnecessary data from the dataset before proceeding with the analysis.

# The same data cleaning process in demographics part, also only consider data with grade A, B, C.
manhattan_data = read_csv("Manhattan_Restaurant_Inspection_Results.csv", na = c("NA", "", "."))
str (manhattan_data)
## spc_tbl_ [94,616 × 27] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ CAMIS                : num [1:94616] 50140436 50158081 50152703 50160975 50161087 ...
##  $ DBA                  : chr [1:94616] "JUST SALAD" "THE MANNER" "MIDNIGHT BLUE" "BLUE BLOSSOM" ...
##  $ BORO                 : chr [1:94616] "Manhattan" "Manhattan" "Manhattan" "Manhattan" ...
##  $ BUILDING             : chr [1:94616] "2853" "58" "106" "108" ...
##  $ STREET               : chr [1:94616] "BROADWAY" "THOMPSON STREET" "EAST   19 STREET" "WEST   39 STREET" ...
##  $ ZIPCODE              : num [1:94616] 10025 10012 10003 10018 10025 ...
##  $ PHONE                : num [1:94616] 7.32e+09 9.17e+09 3.48e+09 6.47e+09 9.29e+09 ...
##  $ CUISINE DESCRIPTION  : chr [1:94616] NA NA NA NA ...
##  $ INSPECTION DATE      : chr [1:94616] "01/01/1900" "01/01/1900" "01/01/1900" "01/01/1900" ...
##  $ ACTION               : chr [1:94616] NA NA NA NA ...
##  $ VIOLATION CODE       : chr [1:94616] NA NA NA NA ...
##  $ VIOLATION DESCRIPTION: chr [1:94616] NA NA NA NA ...
##  $ CRITICAL FLAG        : chr [1:94616] "Not Applicable" "Not Applicable" "Not Applicable" "Not Applicable" ...
##  $ SCORE                : num [1:94616] NA NA NA NA NA NA NA NA NA NA ...
##  $ GRADE                : chr [1:94616] NA NA NA NA ...
##  $ GRADE DATE           : chr [1:94616] NA NA NA NA ...
##  $ RECORD DATE          : chr [1:94616] "11/05/2024" "11/05/2024" "11/05/2024" "11/05/2024" ...
##  $ INSPECTION TYPE      : chr [1:94616] NA NA NA NA ...
##  $ Latitude             : num [1:94616] 40.8 40.7 40.7 40.8 40.8 ...
##  $ Longitude            : num [1:94616] -74 -74 -74 -74 -74 ...
##  $ Community Board      : num [1:94616] 109 102 105 105 107 108 101 105 104 102 ...
##  $ Council District     : chr [1:94616] "07" "01" "02" "04" ...
##  $ Census Tract         : chr [1:94616] "019900" "004700" "005000" "011300" ...
##  $ BIN                  : num [1:94616] 1075440 1087362 1017905 1015273 1055676 ...
##  $ BBL                  : num [1:94616] 1.02e+09 1.00e+09 1.01e+09 1.01e+09 1.02e+09 ...
##  $ NTA                  : chr [1:94616] "MN09" "MN24" "MN21" "MN17" ...
##  $ Location Point1      : logi [1:94616] NA NA NA NA NA NA ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   CAMIS = col_double(),
##   ..   DBA = col_character(),
##   ..   BORO = col_character(),
##   ..   BUILDING = col_character(),
##   ..   STREET = col_character(),
##   ..   ZIPCODE = col_double(),
##   ..   PHONE = col_double(),
##   ..   `CUISINE DESCRIPTION` = col_character(),
##   ..   `INSPECTION DATE` = col_character(),
##   ..   ACTION = col_character(),
##   ..   `VIOLATION CODE` = col_character(),
##   ..   `VIOLATION DESCRIPTION` = col_character(),
##   ..   `CRITICAL FLAG` = col_character(),
##   ..   SCORE = col_double(),
##   ..   GRADE = col_character(),
##   ..   `GRADE DATE` = col_character(),
##   ..   `RECORD DATE` = col_character(),
##   ..   `INSPECTION TYPE` = col_character(),
##   ..   Latitude = col_double(),
##   ..   Longitude = col_double(),
##   ..   `Community Board` = col_double(),
##   ..   `Council District` = col_character(),
##   ..   `Census Tract` = col_character(),
##   ..   BIN = col_double(),
##   ..   BBL = col_double(),
##   ..   NTA = col_character(),
##   ..   `Location Point1` = col_logical()
##   .. )
##  - attr(*, "problems")=<externalptr>

zip_grade = manhattan_data %>%
  janitor::clean_names() %>%  
  filter(
    !is.na(dba),                         
    !is.na(cuisine_description),       
    !is.na(grade),                       
    !is.na(score),                       
    !is.na(zipcode),
    grade %in% c("A", "B", "C")
  ) %>% mutate(region = case_when(
    zipcode >= 10000 & zipcode <= 10025 ~ "Downtown",
    zipcode >= 10026 & zipcode <= 10040 ~ "Midtown",
    zipcode >= 10041 & zipcode <= 10282 ~ "Uptown",
    TRUE ~ "Other" # For ZIP codes like 11371, 12345, etc.
  ))

Chi-Square Test

To investigate if where restaurants are located influence their inspection grades, we produce Chi-Square test. The result is presented in the Contingency Table Heatmap below. The p-value is less than 0.05, leading us to reject the null hypothesis that the region and grade are independent. This indicates a there a significant difference in the distribution of grades among regions at the 0.05 significance level.

# Create a contingency table of zip_code vs. grade
contingency_table <- table(zip_grade$region, zip_grade$grade)

# Perform Chi-Square Test
chi_test <- chisq.test(contingency_table)
## Warning in chisq.test(contingency_table): Chi-squared approximation may be
## incorrect

# Convert the contingency table to a matrix
contingency_matrix <- as.matrix(contingency_table) 
contingency_matrix <- contingency_matrix[order(contingency_matrix[, "A"], decreasing = TRUE), ]

# Extract the residuals (contributions to the Chi-Square statistic)
residuals_matrix <- chi_test$residuals


# Extract chi-square test statistic and p-value
chi_stat <- round(chi_test$statistic, 2)
chi_pval <- format.pval(chi_test$p.value, digits = 3)

# Add the chi-square test result as annotations
plot_ly(
  x = colnames(contingency_matrix),
  y = rownames(contingency_matrix),
  z = contingency_matrix,
  type = "heatmap",
  colors = colorRamp(c("white", "purple"))
) %>%
  layout(
    title = "Contingency Table Heatmap with Chi-Square Result",
    xaxis = list(title = "Grade"),
    yaxis = list(title = "Region"),
    colorbar = list(title = "Count"),
    annotations = list(
      list(
        x = 2.1, 
        y = 3,
        text = paste("Chi-Square Statistic:", chi_stat, "<br>P-Value:", chi_pval),
        showarrow = FALSE,
        font = list(size = 14)
      )
    )
  )
## Warning: 'layout' objects don't have these attributes: 'colorbar'
## Valid attributes include:
## '_deprecated', 'activeshape', 'annotations', 'autosize', 'autotypenumbers', 'calendar', 'clickmode', 'coloraxis', 'colorscale', 'colorway', 'computed', 'datarevision', 'dragmode', 'editrevision', 'editType', 'font', 'geo', 'grid', 'height', 'hidesources', 'hoverdistance', 'hoverlabel', 'hovermode', 'images', 'legend', 'mapbox', 'margin', 'meta', 'metasrc', 'modebar', 'newshape', 'paper_bgcolor', 'plot_bgcolor', 'polar', 'scene', 'selectdirection', 'selectionrevision', 'separators', 'shapes', 'showlegend', 'sliders', 'smith', 'spikedistance', 'template', 'ternary', 'title', 'transition', 'uirevision', 'uniformtext', 'updatemenus', 'width', 'xaxis', 'yaxis', 'barmode', 'bargap', 'mapType'

Cramér’s V test

To investigate how strong the association is between located and grades, we produce Cramér’s V test. The Cramér’s V Value is 0.123, which is larger than 0.1 (Weak association) and smaller than 0.3 (Moderate association), so that we conclude they have weak association.
This conclusion is visualized in theree pie charts below of different regions that we can’t tell large difference among them.

# Calculate Cramér’s V
cramer_v <- assocstats(contingency_table)$cramer

# Print Cramér’s V
print(paste("Cramér's V:", round(cramer_v, 3)))
## [1] "Cramér's V: 0.027"

# uptown visualization
uptown <- zip_grade %>%
  filter(region == "Uptown") %>%
  group_by(region, grade) %>%
  summarize(count = n(), .groups = "drop")
uptown_pie = plot_ly(
  data = uptown,
  labels = ~grade,  # Grades (A, B, C)
  values = ~count,  # Counts of each grade
  type = 'pie',
  textinfo = 'label+percent',  # Display labels and percentages
  hoverinfo = 'label+value+percent' , # Show additional info on hover
    colors = "viridis"
) %>%
  layout(
    title = "Uptown" ,
    legend = list(title = list(text = "Grade"))
  )

# Midtown visualization
midtown <- zip_grade %>%
  filter(region == "Midtown") %>%
  group_by(region, grade) %>%
  summarize(count = n(), .groups = "drop")
midtown_pie = plot_ly(
  data = midtown,
  labels = ~grade,  # Grades (A, B, C)
  values = ~count,  # Counts of each grade
  type = 'pie',
  textinfo = 'label+percent',  # Display labels and percentages
  hoverinfo = 'label+value+percent' , # Show additional info on hover
    colors = "viridis"
) %>%
  layout(
    title = "Midtown" ,
    legend = list(title = list(text = "Grade"))
  )

# Downtown visualization
downtown <- zip_grade %>%
  filter(region == "Downtown") %>%
  group_by(region, grade) %>%
  summarize(count = n(), .groups = "drop")
downtown_pie = plot_ly(
  data = downtown,
  labels = ~grade,  # Grades (A, B, C)
  values = ~count,  # Counts of each grade
  type = 'pie',
  textinfo = 'label+percent',  # Display labels and percentages
  hoverinfo = 'label+value+percent' , # Show additional info on hover
    colors = "viridis"
) %>%
  layout(
    title = "Downtown" ,
    legend = list(title = list(text = "Grade"))
  )

# Combine the pie charts into one layout
doc <- htmltools::tagList(
  div(uptown_pie, style = "float:left;width:33.3%;"),
  div(midtown_pie, style = "float:left;width:33.3%;"),
  div(downtown_pie, style = "float:right;width:33.3%;") 
)

htmltools::browsable(doc)